## boxplot for asthma rates for states with high and low asthma prev
ggplot(df, aes(x = adj_prevalence)) + geom_histogram() + geom_density() + theme_classic() +
labs( x = "Age-adjusted Asthma Prevalence (>= 18 Years Old)",
y = "Count",
title = "Distribution of Asthma Prevalence for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## boxplot for ozone rates for state with high and low asthma prev
ggplot(df, aes(x = yearly_avg_ozone)) + geom_histogram() + geom_density() + theme_classic() +
labs( x = "Yearly Average Ozone Pollution",
y = "Count",
title = "Distribution of Ozone Pollution for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## boxplot for pm25 rates for state with high and low asthma prev
ggplot(df, aes(x = yearly_avg_pm25)) + geom_histogram() + geom_density() + theme_classic() +
labs( x = "Yearly Average PM 2.5 Pollution",
y = "Count",
title = "Distribution of PM 2.5 Pollution for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## boxplot for obesity rates for state with high and low asthma prev
ggplot(df, aes(x = adj_obesity)) + geom_histogram() + geom_density() + theme_classic() +
labs( x = "Age-adjusted Obesity Prevalence (>= 18 Years Old)",
y = "Count",
title = "Distribution of Obesity Prevalence for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## boxplot for smoking rates for state with high and low asthma prev
ggplot(df, aes(x = adj_smoking)) + geom_histogram() + geom_density() + theme_classic() +
labs( x = "Age-adjusted Smoking Prevalence (>= 18 Years Old)",
y = "Count",
title = "Distribution of Smoking Prevalence for Nearly Every County in the US")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## boxplot for asthma rates for states with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = adj_prevalence, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
theme(legend.position = "none") +
labs( x = "Regions Across the United States",
y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)",
title = "Distribution of Asthma Prevalence Across United States Regions")
## boxplot for ozone rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = yearly_avg_ozone, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
theme(legend.position = "none") +
labs( x = "Regions Across the United States",
y = "2016 Year Average Ozone Pollution",
title = "Distribution of Ozone Pollution Prevalence Across United States Regions")
## boxplot for pm25 rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = yearly_avg_pm25, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
theme(legend.position = "none") +
labs( x = "Regions Across the United States",
y = "2016 Year Average PM 2.5 Pollution",
title = "Distribution of PM 2.5 Pollution Prevalence Across United States Regions")
## boxplot for obesity rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = adj_obesity, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
theme(legend.position = "none") +
labs( x = "Regions Across the United States",
y = "Age-adjusted Obesity Prevalence (>= 18 Years Old)",
title = "Distribution of Ozone Pollution Prevalence Across United States Regions")
## boxplot for smoking rates for state with high and low asthma prev
ggplot(df, aes(x = Region_Name, y = adj_smoking, fill = Region_Name)) + geom_boxplot() + scale_fill_brewer(palette = "Set1") + theme_classic() +
theme(legend.position = "none") +
labs( x = "Regions Across the United States",
y = "Age-adjusted Smoking Prevalence (>= 18 Years Old)",
title = "Distribution of Ozone Pollution Prevalence Across United States Regions")
## Asthma Prevalence and Ozone
ggplot(df, aes(x = yearly_avg_ozone, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() + #geom_density_2d_filled()
theme(legend.title = element_blank(), legend.position = "bottom") +
labs( x = "2016 Year Average Ozone Pollution",
y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)",
title = "Scatter Plot of Ozone Pollution And Asthma Prevalence")
## Asthma Prevalence and PM25
ggplot(df, aes(x = yearly_avg_pm25, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() + #geom_density_2d_filled()
theme(legend.title = element_blank(), legend.position = "bottom") +
labs( x = "2016 Year Average PM 2.5 Pollution",
y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)",
title = "Scatter Plot of PM 2.5 Pollution And Asthma Prevalence")
## Asthma Prevalence and Smoking
ggplot(df, aes(x = adj_smoking, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() + #geom_density_2d_filled()
theme(legend.title = element_blank(), legend.position = "bottom") +
labs( x = "Age-adusted Smoking Prevalence (>= 18 Years Old)",
y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)",
title = "Scatter Plot of Smoking Prevalence And Asthma Prevalence")
## Asthma Prevalence and Obesity
ggplot(df, aes(x = adj_obesity, y = adj_prevalence, color = Region_Name)) + geom_point() + scale_colour_brewer(palette = "Set1") + theme_classic() + #geom_density_2d_filled()
theme(legend.title = element_blank(), legend.position = "bottom") +
labs( x = "Age-adusted Obesity Prevalence (>= 18 Years Old)",
y = "Age-adjusted Asthma Prevalence (>= 18 Years Old)",
title = "Scatter Plot of Obesity Prevalence And Asthma Prevalence")
The model that is going to be use for the project includes Mod2b.
By comparing the AIC values (generally want a lower value), comparison of the values indicates the smoking model is the singular most influential factor for predicting asthma prevalence. This is further supported by enormous F value for that model. Given the availability of additional risk factors, multivariate models will be included. The residual plots for this model also line up appropriately/ideally.
ozone.lm = lm(adj_prevalence ~ yearly_avg_ozone, data = df)
summary.aov(ozone.lm)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.096 30.36 3.92e-08 ***
## Residuals 2818 2793.7 0.991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(ozone.lm)
## [1] 7982.367
plot(ozone.lm)
pm25.lm = lm(adj_prevalence ~ yearly_avg_pm25, data = df)
summary.aov(pm25.lm)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_pm25 1 123 123.04 128.4 <2e-16 ***
## Residuals 2818 2701 0.96
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(pm25.lm)
## [1] 7886.956
plot(pm25.lm)
obesity.lm = lm(adj_prevalence ~ adj_obesity, data = df)
summary.aov(obesity.lm)
## Df Sum Sq Mean Sq F value Pr(>F)
## adj_obesity 1 468.1 468.1 560 <2e-16 ***
## Residuals 2818 2355.6 0.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(obesity.lm)
## [1] 7501.43
plot(obesity.lm)
smoking.lm = lm(adj_prevalence ~ adj_smoking, data = df)
summary.aov(smoking.lm)
## Df Sum Sq Mean Sq F value Pr(>F)
## adj_smoking 1 1455 1454.6 2994 <2e-16 ***
## Residuals 2818 1369 0.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(smoking.lm)
## [1] 5971.25
plot(smoking.lm)
The initial plan was preform backwards and then forward selection, however the first backward selection mode, all variables came back significant. This could be a function of having multiple data points, however regardless, then the interactions between the individual variables were explored.
The diagnostic plots for the multivariate models fit more appropriately than the any of the uni-variate models. The least complicated model with an idea AIC score is the the model that include the interactions between regions and obesity. This decision took consideration the diagnostics plots and the AIC values.
mod1 = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_obesity + adj_smoking + Region_Name, data = df)
#summary(mod1)
summary.aov(mod1)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.1 110.8 <2e-16 ***
## yearly_avg_pm25 1 103.1 103.1 379.7 <2e-16 ***
## adj_obesity 1 405.1 405.1 1492.2 <2e-16 ***
## adj_smoking 1 927.4 927.4 3415.6 <2e-16 ***
## Region_Name 7 595.7 85.1 313.4 <2e-16 ***
## Residuals 2808 762.4 0.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 4340.202
plot(mod1)
mod2a = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_obesity+ Region_Name * adj_smoking, data = df)
#summary(mod2a)
summary.aov(mod2a)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.1 122.28 <2e-16 ***
## yearly_avg_pm25 1 103.1 103.1 418.87 <2e-16 ***
## adj_obesity 1 405.1 405.1 1646.02 <2e-16 ***
## Region_Name 7 798.8 114.1 463.65 <2e-16 ***
## adj_smoking 1 724.2 724.2 2942.25 <2e-16 ***
## Region_Name:adj_smoking 7 73.0 10.4 42.36 <2e-16 ***
## Residuals 2801 689.4 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2a)
## [1] 4070.463
plot(mod2a)
mod2b = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + Region_Name * adj_obesity, data = df)
#summary(mod2b)
summary.aov(mod2b)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.1 117.20 <2e-16 ***
## yearly_avg_pm25 1 103.1 103.1 401.48 <2e-16 ***
## adj_smoking 1 1330.5 1330.5 5181.07 <2e-16 ***
## Region_Name 7 554.0 79.1 308.21 <2e-16 ***
## adj_obesity 1 43.6 43.6 169.94 <2e-16 ***
## Region_Name:adj_obesity 7 43.1 6.2 23.98 <2e-16 ***
## Residuals 2801 719.3 0.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2b)
## [1] 4190.079
plot(mod2b)
mod2c = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + Region_Name * adj_smoking + Region_Name * adj_obesity, data = df)
#summary(mod2c)
summary.aov(mod2c)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.1 136.27 <2e-16 ***
## yearly_avg_pm25 1 103.1 103.1 466.81 <2e-16 ***
## adj_smoking 1 1330.5 1330.5 6024.12 <2e-16 ***
## Region_Name 7 554.0 79.1 358.36 <2e-16 ***
## adj_obesity 1 43.6 43.6 197.59 <2e-16 ***
## adj_smoking:Region_Name 7 73.0 10.4 47.20 <2e-16 ***
## Region_Name:adj_obesity 7 72.3 10.3 46.79 <2e-16 ***
## Residuals 2794 617.1 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2c)
## [1] 3771.881
plot(mod2c)
mod2d = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + Region_Name * log(adj_smoking) + Region_Name * adj_obesity, data = df)
#summary(mod2d)
summary.aov(mod2d)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.1 137.53 <2e-16 ***
## yearly_avg_pm25 1 103.1 103.1 471.12 <2e-16 ***
## adj_smoking 1 1330.5 1330.5 6079.78 <2e-16 ***
## Region_Name 7 554.0 79.1 361.67 <2e-16 ***
## log(adj_smoking) 1 25.9 25.9 118.13 <2e-16 ***
## adj_obesity 1 47.5 47.5 217.07 <2e-16 ***
## Region_Name:log(adj_smoking) 7 51.7 7.4 33.75 <2e-16 ***
## Region_Name:adj_obesity 7 69.8 10.0 45.54 <2e-16 ***
## Residuals 2793 611.2 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod2d)
## [1] 3746.935
plot(mod2d)
mod3 = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_obesity + adj_smoking + state_desc, data = df)
#summary(mod3)
summary.aov(mod3)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.1 264.8 <2e-16 ***
## yearly_avg_pm25 1 103.1 103.1 907.1 <2e-16 ***
## adj_obesity 1 405.1 405.1 3564.6 <2e-16 ***
## adj_smoking 1 927.4 927.4 8159.2 <2e-16 ***
## state_desc 42 1042.9 24.8 218.5 <2e-16 ***
## Residuals 2773 315.2 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod3)
## [1] 1919.159
plot(mod3)
## Warning: not plotting observations with leverage one:
## 4
mod4 = lm(adj_prevalence ~ yearly_avg_ozone + yearly_avg_pm25 + adj_smoking + state_desc * adj_obesity, data = df)
#summary(mod4)
summary.aov(mod4)
## Df Sum Sq Mean Sq F value Pr(>F)
## yearly_avg_ozone 1 30.1 30.1 310.39 <2e-16 ***
## yearly_avg_pm25 1 103.1 103.1 1063.31 <2e-16 ***
## adj_smoking 1 1330.5 1330.5 13721.95 <2e-16 ***
## state_desc 42 1025.0 24.4 251.71 <2e-16 ***
## adj_obesity 1 19.9 19.9 204.76 <2e-16 ***
## state_desc:adj_obesity 41 50.3 1.2 12.65 <2e-16 ***
## Residuals 2732 264.9 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod4)
## [1] 1511.104
plot(mod4)
## Warning: not plotting observations with leverage one:
## 4
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced